# Working directory for the analysis.
# NOTE(review): setwd() in a script is fragile -- prefer project-relative
# paths (e.g. an RStudio project or the `here` package) when refactoring.
setwd("~/Public/klarahan.github.io")
# Locale switch for Korean text; intentionally disabled here (re-enable if
# Korean characters fail to render on your platform).
#Sys.setlocale(locale = "Korean")
Load the libraries.
# Load libraries ----
# Date/time parsing helpers.
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Plotting.
library(ggplot2)
# Data-frame manipulation verbs (filter, mutate, count, joins, ...).
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# unnest_tokens() for tidy tokenization.
library(tidytext)
# Corpus / tokens / document-feature-matrix infrastructure.
library(quanteda)
## Package version: 2.1.2
## Parallel computing: 2 of 6 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
# Scale/axis formatting for ggplot2.
library(scales)
# Latent Semantic Scaling (textmodel_lss).
library(LSX)
## Registered S3 methods overwritten by 'quanteda.textstats':
## method from
## [.textstat quanteda
## as.data.frame.textstat_proxy quanteda
## as.list.textstat_proxy quanteda
## head.textstat_proxy quanteda
## tail.textstat_proxy quanteda
Load the data.
# Pre-tokenized unigram (noun) sample of the news corpus, built upstream.
data_uni <- readRDS("data/data_uni_nouns_sample")
#Scaling Load the Korean sentiment lexicon.
# NOTE(review): the readr column-specification echoes below imply that a
# read_csv()/read_tsv() call loading the `senti` sentiment lexicon ran here,
# but the call itself is missing from this file. `senti` is required by the
# inner_join() steps further down -- restore the loading code before running.
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## term = col_character()
## )
##
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## term = col_character()
## )
See the top sentiment words.
# Tokenize the article text into a tidy one-word-per-row data frame.
tidy_news <- unnest_tokens(data_uni, word, text)
# Frequency of sentiment-lexicon words in the corpus, most common first.
count(inner_join(tidy_news, senti), word, sort = TRUE)
## Joining, by = "word"
# Net sentiment (positive minus negative word counts) per newspaper and
# ruling party. The join key is stated explicitly so the result does not
# depend on dplyr's column-name guessing.
senti_news <- tidy_news %>%
  inner_join(senti, by = "word") %>%
  count(Newspaper, Prezparty, sentiment) %>%
  # pivot_wider() supersedes tidyr::spread(); values_fill mirrors fill = 0
  tidyr::pivot_wider(names_from = sentiment, values_from = n,
                     values_fill = 0) %>%
  mutate(sentiment = positive - negative)
# Per-word sentiment counts restricted to the Roh Tae-woo administration.
# Explicit join key (see note above); the trailing ungroup() was dropped
# because count() already returns an ungrouped data frame.
senti_word_counts <- tidy_news %>%
  inner_join(senti, by = "word") %>%
  filter(Government == "1990-1993 Roh TW") %>%
  count(word, sentiment, Government, sort = TRUE)
# Print the counts.
senti_word_counts
Plot the top sentiment words.
# Plot the 25 most frequent words for each sentiment class, one facet per
# class, as horizontal grey bars.
senti_word_counts %>%
  group_by(sentiment) %>%
  # slice_max() supersedes top_n(); ties are kept by default, matching
  # top_n()'s behavior (the wt column is now named explicitly).
  slice_max(n, n = 25) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip() +
  scale_fill_grey() +
  theme_bw()

# Sys.setlocale(locale = "C")
# Seed dictionary for the semi-supervised sentiment scale: six positive and
# six negative Korean seed words.
dict_sentiment <- dictionary(list(
  positive = c("구상", "자유", "평화", "협력", "지원", "번영"),
  negative = c("투쟁", "대립", "무산", "세력", "비판", "분열")
))
# Inspect the dictionary.
dict_sentiment
## Dictionary object with 2 key entries.
## - [positive]:
## - 구상, 자유, 평화, 협력, 지원, 번영
## - [negative]:
## - 투쟁, 대립, 무산, 세력, 비판, 분열
# tokenize text corpus and remove various features
# Drop duplicate articles, strip HTML tags plus the newspaper-scan
# boilerplate marker, and reshape the corpus to one document per sentence.
corp_sent <- corpus_reshape(
  corpus(mutate(unique(data_uni),
                Body = gsub("</?[^>]+>|▲ 종이신문보기", "", Body))),
  to = "sentences"
)
# Tokenize the sentence-level corpus.
toks <- tokens(corp_sent)
# create a document feature matrix from the tokens object
# create a document feature matrix from the tokens object
# NOTE(review): dfm(remove = ) is valid in the pinned quanteda 2.1.2 but is
# deprecated in quanteda >= 3; migrate to dfm() %>% dfm_remove("") on upgrade.
dfmat <- toks %>%
dfm(remove = "") %>%
dfm_trim(min_termfreq = 5)
# Show the 20 most frequent features after trimming rare terms (< 5 uses).
topfeatures(dfmat, 20)
## 북한 통일 우리 정부 문제 남북 대통령 한국 경제 중국 회담
## 129413 94417 72746 68951 64971 54060 53658 52417 44903 39960 39947
## 정치 의원 정책 미국 장관 국가 기자 사람 관계
## 38322 36757 35157 35152 34474 33897 33403 32760 32724
# Convert the dictionary into a named polarity vector (+1 / -1 seed weights).
seed <- as.seedwords(dict_sentiment)
seed
## 구상 자유 평화 협력 지원 번영 투쟁 대립 무산 세력 비판 분열
## 1 1 1 1 1 1 -1 -1 -1 -1 -1 -1
# identify context words
# Words co-occurring with any token matching *통일* ("unification"),
# at significance level p = 0.05.
context_terms <- char_context(toks, pattern = "*통일*", p = 0.05)
# run LSS model
# Latent Semantic Scaling with k = 300 singular values, restricted to the
# unification-context terms; cache = TRUE reuses the saved SVD (see the
# lss_cache read below), so delete the cache after changing dfmat.
tmod_lss <- textmodel_lss(dfmat, seeds = seed,
terms = context_terms, k = 300, cache = TRUE)
## Reading cache file: lss_cache/svds_fb535efe918180cd.RDS
Look up key periods.
#Sys.setlocale(locale = "Korean")
# Show the full body of every article whose text mentions "회장" (chairman).
select(filter(data_uni, grepl("회장", text)), Body)